library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──
✔ broom        1.0.8     ✔ rsample      1.3.0
✔ dials        1.4.0     ✔ tune         1.3.0
✔ infer        1.0.8     ✔ workflows    1.2.0
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.3.1     ✔ yardstick    1.3.2
✔ recipes      1.3.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
library(powerjoin)
library(glue)
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
root  <- 'https://gdex.ucar.edu/dataset/camels/file'

download.file('https://gdex.ucar.edu/dataset/camels/file/camels_attributes_v2.0.pdf', 
              'data/camels_attributes_v2.0.pdf')

types <- c("clim", "geol", "soil", "topo", "vege", "hydro")

# Where the files live online ...
remote_files  <- glue('{root}/camels_{types}.txt')
# where we want to download the data ...
local_files   <- glue('data/camels_{types}.txt')

walk2(remote_files, local_files, download.file, quiet = TRUE)

# Read and merge data
camels <- map(local_files, read_delim, show_col_types = FALSE) 

camels <- power_full_join(camels ,by = 'gauge_id')
#### QUESTION 1

library(ggplot2)


ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = q_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()

##### Zero_q_freq indicates the frequency of days where Q = 0 mm/day.
##### QUESTION 2


aridity_map <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
   borders("state", colour = "gray50") +
  geom_point(aes(color = aridity)) +
  scale_color_gradient(low = "red", high = "purple") +
  labs(color = "Priestly-Taylor Aridity Index",
       x = "Longitude",
       y = "Latitude")+
  ggthemes::theme_map()


rainfall_map <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
   borders("state", colour = "gray50") +
  geom_point(aes(color = p_mean)) +
  scale_color_gradient(low = "green", high = "blue") +
  labs(color = "Rainfall",
       x = "Longitude",
       y = "Latitude") +
  ggthemes::theme_map()

library(patchwork)

combined_map <- aridity_map + rainfall_map  
print(combined_map)

camels |> 
  select(aridity, p_mean, q_mean) |> 
  drop_na() |> 
  cor()
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000
# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() + 
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        # Expand the legend width ...
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow") 
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() + 
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") + 
  labs(title = "Aridity vs Rainfall vs Runnoff", 
       x = "Aridity", 
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

set.seed(123)
# Bad form to perform simple transformations on the outcome variable within a 
# recipe. So, we'll do it here.
camels <- camels |> 
  mutate(logQmean = log(q_mean))

# Generate the split
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)




# Create a recipe to preprocess the data
rec <-  recipe(logQmean ~ aridity + p_mean, data = camels_train) %>%
  # Log transform the predictor variables (aridity and p_mean)
  step_log(all_predictors()) %>%
  # Add an interaction term between aridity and p_mean
  step_interact(terms = ~ aridity:p_mean) |> 
  # Drop any rows with missing values in the pred
  step_naomit(all_predictors(), all_outcomes())


# Prepare the data
baked_data <- prep(rec, camels_train) |> 
  bake(new_data = NULL)

# Interaction with lm
#  Base lm sets interaction terms with the * symbol
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
# Sanity Interaction term from recipe ... these should be equal!!
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean, 
    data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.77586    0.16365 -10.852  < 2e-16 ***
aridity          -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean            1.48438    0.15511   9.570  < 2e-16 ***
aridity_x_p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
test_data <-  bake(prep(rec), new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)


metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  # Apply a gradient color scale
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() + 
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

# Define model
lm_model <- linear_reg() %>%
  # define the engine
  set_engine("lm") %>%
  # define the mode
  set_mode("regression")

# Instantiate a workflow ...
lm_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(lm_model) %>%
  # Fit the model to the training data
  fit(data = camels_train) 

# Extract the model coefficients from the workflow
summary(extract_fit_engine(lm_wf))$coefficients
                   Estimate Std. Error    t value     Pr(>|t|)
(Intercept)      -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity          -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean            1.4843771 0.15511117   9.569762 4.022500e-20
aridity_x_p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
# From the base implementation
summary(lm_base)$coefficients
                 Estimate Std. Error    t value     Pr(>|t|)
(Intercept)    -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity        -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean          1.4843771 0.15511117   9.569762 4.022500e-20
aridity:p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
#
lm_data <- augment(lm_wf, new_data = camels_test)
dim(lm_data)
[1] 135  61
metrics(lm_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(lm_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

library(baguette)
rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(rf_model) %>%
  # Fit the model
  fit(data = camels_train) 

rf_data <- augment(rf_wf, new_data = camels_test)
dim(rf_data)
[1] 135  60
metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.587
2 rsq     standard       0.741
3 mae     standard       0.363
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

wf <- workflow_set(list(rec), list(lm_model, rf_model)) %>%
  workflow_map('fit_resamples', resamples = camels_cv) 

autoplot(wf)

rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 4 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_rand_fore… Prepro… rmse    0.563  0.0247    10 recipe       rand…     1
2 recipe_rand_fore… Prepro… rsq     0.771  0.0259    10 recipe       rand…     1
3 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     2
4 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     2
aridity_map

rainfall_map

#### QUESTION 3


library(xgboost)

Attaching package: 'xgboost'
The following object is masked from 'package:dplyr':

    slice
xgboost_model <- boost_tree() %>%
  set_engine("xgboost") %>%
set_mode("regression")

nn_model <- bag_mlp() %>%
  set_engine("nnet") %>%
  set_mode("regression")
  
  wf <- workflow_set(list(rec), list(lm_model, rf_model, xgboost_model,
                                     nn_model)) %>%
  workflow_map('fit_resamples', resamples = camels_cv)
  
  autoplot(wf)

  rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 8 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_bag_mlp    Prepro… rmse    0.547  0.0308    10 recipe       bag_…     1
2 recipe_bag_mlp    Prepro… rsq     0.787  0.0266    10 recipe       bag_…     1
3 recipe_rand_fore… Prepro… rmse    0.565  0.0247    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.770  0.0258    10 recipe       rand…     2
5 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     3
6 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     3
7 recipe_boost_tree Prepro… rmse    0.600  0.0289    10 recipe       boos…     4
8 recipe_boost_tree Prepro… rsq     0.745  0.0268    10 recipe       boos…     4
  #### I think the neural network model is the best because it has the highest R-squared and 
  ### the lowest RMSE value
#### QUESTION 4

##data splitting
set.seed(555)
camels_select <- camels |>
  mutate(logQmean = log(q_mean + 1)) |>
  select(logQmean, aridity, p_mean, pet_mean, p_seasonality) %>% 
  drop_na()

camels_split <- initial_split(camels_select, prop = 0.75)
camels_training <- training(camels_split)
camels_testing <- testing(camels_split)

camels_folds4 <- vfold_cv(camels_training, v = 10)
##recipe
second_rec <- recipe(logQmean ~ ., data = camels_training) |>
  #step_log(all_predictors()) |>
  step_normalize(all_predictors()) |>
  step_interact(terms = ~ aridity:p_mean)

## I chose this formula because the selected data shows the relationship best

rand_model <- rand_forest() |>
  set_engine("ranger") |>
  set_mode("regression")

linear_model <- linear_reg() |>
  set_engine("lm") |>
  set_mode("regression")

decision_model <- decision_tree() |>
  set_engine("rpart") |>
  set_mode("regression")
##workflow set
wf4 <- workflow_set(list(second_rec), list(rand_model, linear_model, decision_model)) |>
  workflow_map('fit_resamples', resamples = camels_folds4)
→ A | warning: prediction from rank-deficient fit; consider predict(., rankdeficient="NA")
There were issues with some computations   A: x1
There were issues with some computations   A: x10
## evaluate

autoplot(wf4)

rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 8 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_bag_mlp    Prepro… rmse    0.547  0.0308    10 recipe       bag_…     1
2 recipe_bag_mlp    Prepro… rsq     0.787  0.0266    10 recipe       bag_…     1
3 recipe_rand_fore… Prepro… rmse    0.565  0.0247    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.770  0.0258    10 recipe       rand…     2
5 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     3
6 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     3
7 recipe_boost_tree Prepro… rmse    0.600  0.0289    10 recipe       boos…     4
8 recipe_boost_tree Prepro… rsq     0.745  0.0268    10 recipe       boos…     4
##I think the random forest model is the best because of the high rsq value

rf_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(rand_model) %>%
  fit(data = camels_train) 

data <- augment(rf_wf, new_data = camels_test)

ggplot(data, aes(x = logQmean, y = .pred, colour = aridity)) +
  geom_point() + 
  scale_color_continuous() +
  labs(
    title = "Observed vs. Predicted Values", color = "blue"
  )

## I think the plot shows that it is a decent model due to the positive slope and values that are close to the 1:1 line